# Generic many-core configuration file, loosely inspired by the Intel Xeon Phi (Knights Corner/KNC)
# Has many small-ish cores, private L1/L2, directory-based coherency, 2-D mesh network, high memory bandwidth
# Requires -n 64 (matching network/emesh_hop_by_hop/size = 16:4); update that size to support other core counts

[general]
# Instruction-cache modeling is switched on; the L1-I parameters are given in [perf_model/l1_icache].
enable_icache_modeling = true

[perf_model/core]
# Core model selection
type = interval
core_model = nehalem

# NOTE(review): frequency is presumably in GHz; confirm against the simulator's base config
frequency = 1.0

# Number of SMT threads per core
logical_cpus = 1

[perf_model/core/interval_timer]
# Presumably the tuning knobs for the core model chosen by [perf_model/core] type = interval.
# NOTE(review): window_size is presumably the instruction-window size in entries; confirm.
dispatch_width = 2
window_size = 32

[perf_model/sync]
# NOTE(review): the unit of reschedule_cost is not stated in this file (cycles vs. ns); confirm against the simulator docs.
reschedule_cost = 1000

[caching_protocol]
# Directory-based coherence (see file header); the directory itself is configured in [perf_model/dram_directory].
type = parametric_dram_directory_msi

[perf_model/branch_predictor]
type = pentium_m
# From microarchitecture.pdf (Nehalem has a longer pipeline than Core2)
# NOTE(review): the rationale above appears to be inherited from a Nehalem-oriented config;
# confirm that 5 is the intended penalty for this many-core model.
mispredict_penalty = 5

[perf_model/cache]
# Two cache levels: the L1 (l1_icache / l1_dcache) and L2 (l2_cache) sections of this file.
levels = 2

[perf_model/l1_icache]
# L1 instruction cache, private to each core (shared_cores = 1).
perfect = false

# Geometry. NOTE(review): cache_size is presumably in KB; confirm.
cache_size = 32
associativity = 4
shared_cores = 1

# Indexing, replacement and write policy
address_hash = mask
replacement_policy = lru
writethrough = 0

# Timing. NOTE(review): access times are presumably in core cycles; confirm.
perf_model_type = parallel
tags_access_time = 1
data_access_time = 3

[perf_model/l1_dcache]
# L1 data cache, private to each core (shared_cores = 1).
perfect = false

# Geometry. NOTE(review): cache_size is presumably in KB; confirm.
cache_size = 32
associativity = 8
shared_cores = 1

# Indexing, replacement and write policy
address_hash = mask
replacement_policy = lru
writethrough = 0

# Timing. NOTE(review): access times are presumably in core cycles; confirm.
perf_model_type = parallel
tags_access_time = 1
data_access_time = 3

[perf_model/l2_cache]
# L2 cache, private to each core (shared_cores = 1); last cache level since perf_model/cache/levels = 2.
perfect = false

# Geometry. NOTE(review): cache_size is presumably in KB; confirm.
cache_size = 512
associativity = 8
shared_cores = 1

# Indexing, replacement and write policy
address_hash = mask
replacement_policy = lru
writethrough = 0

# Timing. NOTE(review): times are presumably in core cycles; confirm.
perf_model_type = parallel
tags_access_time = 5
data_access_time = 22
writeback_time = 1

[perf_model/dram_directory]
# total_entries = number of entries per directory controller.
# 1048576 = 2^20 entries.
total_entries = 1048576
associativity = 64
directory_type = full_map
# NOTE(review): "llc" presumably places the directory alongside the last-level cache (L2 in this file); confirm.
locations = llc

[perf_model/dram]
# 8 controllers x 32 per controller = 256 aggregate bandwidth (in the unit of per_controller_bandwidth).
# NOTE(review): units are not stated here; presumably ns for latency and GB/s for bandwidth. Confirm against the simulator's base config.
num_controllers = 8
latency = 80
per_controller_bandwidth = 32

[perf_model/dram/queue_model]
# Queue modeling for DRAM is enabled, using the windowed_mg1 model (the same type the network queue model uses).
enabled = true
type = windowed_mg1

[network]
# Selects the emesh_hop_by_hop model for memory_model_1; its parameters are in [network/emesh_hop_by_hop].
memory_model_1 = emesh_hop_by_hop

[network/emesh_hop_by_hop]
# --- Topology: 2-D mesh without wrap-around links ---
# Mesh dimensions (1 for line/ring, 2 for mesh/torus)
dimensions = 2
# Has wrap-around links? (false for line/mesh, true for ring/torus)
wrap_around = false
# Mesh size; 16 * 4 = 64 nodes, in line with the 64-core requirement stated in the file header
size = 16:4
# Number of cores per network interface (must be >= last-level-cache/shared_cores)
concentration = 1

# --- Link parameters ---
# Per-hop latency in core cycles
hop_latency = 4
# Per-link, per-direction bandwidth in bits/cycle
link_bandwidth = 256

[network/emesh_hop_by_hop/queue_model]
# Queue modeling for the mesh network is enabled, using the windowed_mg1 model (the same type the DRAM queue model uses).
enabled = true
type = windowed_mg1

[scheduler/pinned]
quantum = 100              # Scheduler quantum (round-robin for active threads on each core), in nanoseconds
                           # Shortest possible quantum (one barrier quantum) configured, so we can use this to mimic SMT

[dvfs]
# In ns, "under 2 microseconds" according to
# http://download.intel.com/design/intarch/papers/323671.pdf (page 8)
transition_latency = 2000

[dvfs/simple]
# One core per socket. NOTE(review): presumably this gives every core its own DVFS domain; confirm.
cores_per_socket = 1

[power]
# Supply voltage, in volts
vdd = 1.05
# Technology node, in nm
technology_node = 22
